# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
print(__version__)
import plotly.graph_objects as go
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
4.14.3
#Read the data set in to data frame
pr_loan = pd.read_csv('prosperLoanData.csv')
#Make copy of the data
df_loan = pr_loan.copy()
#Check Properties
df_loan.head()
| ListingKey | ListingNumber | ListingCreationDate | CreditGrade | Term | LoanStatus | ClosedDate | BorrowerAPR | BorrowerRate | LenderYield | ... | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1021339766868145413AB3B | 193129 | 2007-08-26 19:09:29.263000000 | C | 36 | Completed | 2009-08-14 00:00:00 | 0.16516 | 0.1580 | 0.1380 | ... | -133.18 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 258 |
| 1 | 10273602499503308B223C1 | 1209647 | 2014-02-27 08:28:07.900000000 | NaN | 36 | Current | NaN | 0.12016 | 0.0920 | 0.0820 | ... | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 2 | 0EE9337825851032864889A | 81716 | 2007-01-05 15:00:47.090000000 | HR | 36 | Completed | 2009-12-17 00:00:00 | 0.28269 | 0.2750 | 0.2400 | ... | -24.20 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 41 |
| 3 | 0EF5356002482715299901A | 658116 | 2012-10-22 11:02:35.010000000 | NaN | 36 | Current | NaN | 0.12528 | 0.0974 | 0.0874 | ... | -108.01 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 158 |
| 4 | 0F023589499656230C5E3E2 | 909464 | 2013-09-14 18:38:39.097000000 | NaN | 36 | Current | NaN | 0.24614 | 0.2085 | 0.1985 | ... | -60.27 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 20 |
5 rows × 81 columns
df_loan.shape
(113937, 81)
df_loan.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingKey 113937 non-null object 1 ListingNumber 113937 non-null int64 2 ListingCreationDate 113937 non-null object 3 CreditGrade 28953 non-null object 4 Term 113937 non-null int64 5 LoanStatus 113937 non-null object 6 ClosedDate 55089 non-null object 7 BorrowerAPR 113912 non-null float64 8 BorrowerRate 113937 non-null float64 9 LenderYield 113937 non-null float64 10 EstimatedEffectiveYield 84853 non-null float64 11 EstimatedLoss 84853 non-null float64 12 EstimatedReturn 84853 non-null float64 13 ProsperRating (numeric) 84853 non-null float64 14 ProsperRating (Alpha) 84853 non-null object 15 ProsperScore 84853 non-null float64 16 ListingCategory (numeric) 113937 non-null int64 17 BorrowerState 108422 non-null object 18 Occupation 110349 non-null object 19 EmploymentStatus 111682 non-null object 20 EmploymentStatusDuration 106312 non-null float64 21 IsBorrowerHomeowner 113937 non-null bool 22 CurrentlyInGroup 113937 non-null bool 23 GroupKey 13341 non-null object 24 DateCreditPulled 113937 non-null object 25 CreditScoreRangeLower 113346 non-null float64 26 CreditScoreRangeUpper 113346 non-null float64 27 FirstRecordedCreditLine 113240 non-null object 28 CurrentCreditLines 106333 non-null float64 29 OpenCreditLines 106333 non-null float64 30 TotalCreditLinespast7years 113240 non-null float64 31 OpenRevolvingAccounts 113937 non-null int64 32 OpenRevolvingMonthlyPayment 113937 non-null float64 33 InquiriesLast6Months 113240 non-null float64 34 TotalInquiries 112778 non-null float64 35 CurrentDelinquencies 113240 non-null float64 36 AmountDelinquent 106315 non-null float64 37 DelinquenciesLast7Years 112947 non-null float64 38 PublicRecordsLast10Years 113240 non-null float64 39 PublicRecordsLast12Months 106333 non-null float64 40 RevolvingCreditBalance 106333 non-null float64 41 BankcardUtilization 106333 non-null float64 42 AvailableBankcardCredit 106393 non-null float64 43 TotalTrades 106393 non-null float64 44 TradesNeverDelinquent (percentage) 106393 non-null float64 45 TradesOpenedLast6Months 106393 non-null float64 46 DebtToIncomeRatio 105383 non-null float64 47 IncomeRange 113937 non-null object 48 IncomeVerifiable 113937 non-null bool 49 StatedMonthlyIncome 113937 non-null float64 50 LoanKey 113937 non-null object 51 TotalProsperLoans 22085 non-null float64 52 TotalProsperPaymentsBilled 22085 non-null float64 53 OnTimeProsperPayments 22085 non-null float64 54 ProsperPaymentsLessThanOneMonthLate 22085 non-null float64 55 ProsperPaymentsOneMonthPlusLate 22085 non-null float64 56 ProsperPrincipalBorrowed 22085 non-null float64 57 ProsperPrincipalOutstanding 22085 non-null float64 58 ScorexChangeAtTimeOfListing 18928 non-null float64 59 LoanCurrentDaysDelinquent 113937 non-null int64 60 LoanFirstDefaultedCycleNumber 16952 non-null float64 61 LoanMonthsSinceOrigination 113937 non-null int64 62 LoanNumber 113937 non-null int64 63 LoanOriginalAmount 113937 non-null int64 64 LoanOriginationDate 113937 non-null object 65 LoanOriginationQuarter 113937 non-null object 66 MemberKey 113937 non-null object 67 MonthlyLoanPayment 113937 non-null float64 68 LP_CustomerPayments 113937 non-null float64 69 LP_CustomerPrincipalPayments 113937 non-null float64 70 LP_InterestandFees 113937 non-null float64 71 LP_ServiceFees 113937 non-null float64 72 LP_CollectionFees 113937 non-null float64 73 LP_GrossPrincipalLoss 113937 non-null float64 74 LP_NetPrincipalLoss 113937 non-null float64 75 LP_NonPrincipalRecoverypayments 113937 non-null float64 76 PercentFunded 113937 non-null float64 77 Recommendations 113937 non-null int64 78 InvestmentFromFriendsCount 113937 non-null int64 79 InvestmentFromFriendsAmount 113937 non-null float64 80 Investors 113937 non-null int64 dtypes: bool(3), float64(50), int64(11), object(17) memory usage: 68.1+ MB
df_loan.describe()
| ListingNumber | Term | BorrowerAPR | BorrowerRate | LenderYield | EstimatedEffectiveYield | EstimatedLoss | EstimatedReturn | ProsperRating (numeric) | ProsperScore | ... | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.139370e+05 | 113937.000000 | 113912.000000 | 113937.000000 | 113937.000000 | 84853.000000 | 84853.000000 | 84853.000000 | 84853.000000 | 84853.000000 | ... | 113937.000000 | 113937.000000 | 113937.000000 | 113937.000000 | 113937.000000 | 113937.000000 | 113937.000000 | 113937.000000 | 113937.000000 | 113937.000000 |
| mean | 6.278857e+05 | 40.830248 | 0.218828 | 0.192764 | 0.182701 | 0.168661 | 0.080306 | 0.096068 | 4.072243 | 5.950067 | ... | -54.725641 | -14.242698 | 700.446342 | 681.420499 | 25.142686 | 0.998584 | 0.048027 | 0.023460 | 16.550751 | 80.475228 |
| std | 3.280762e+05 | 10.436212 | 0.080364 | 0.074818 | 0.074516 | 0.068467 | 0.046764 | 0.030403 | 1.673227 | 2.376501 | ... | 60.675425 | 109.232758 | 2388.513831 | 2357.167068 | 275.657937 | 0.017919 | 0.332353 | 0.232412 | 294.545422 | 103.239020 |
| min | 4.000000e+00 | 12.000000 | 0.006530 | 0.000000 | -0.010000 | -0.182700 | 0.004900 | -0.182700 | 1.000000 | 1.000000 | ... | -664.870000 | -9274.750000 | -94.200000 | -954.550000 | 0.000000 | 0.700000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 4.009190e+05 | 36.000000 | 0.156290 | 0.134000 | 0.124200 | 0.115670 | 0.042400 | 0.074080 | 3.000000 | 4.000000 | ... | -73.180000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 |
| 50% | 6.005540e+05 | 36.000000 | 0.209760 | 0.184000 | 0.173000 | 0.161500 | 0.072400 | 0.091700 | 4.000000 | 6.000000 | ... | -34.440000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 44.000000 |
| 75% | 8.926340e+05 | 36.000000 | 0.283810 | 0.250000 | 0.240000 | 0.224300 | 0.112000 | 0.116600 | 5.000000 | 8.000000 | ... | -13.920000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 115.000000 |
| max | 1.255725e+06 | 60.000000 | 0.512290 | 0.497500 | 0.492500 | 0.319900 | 0.366000 | 0.283700 | 7.000000 | 11.000000 | ... | 32.060000 | 0.000000 | 25000.000000 | 25000.000000 | 21117.900000 | 1.012500 | 39.000000 | 33.000000 | 25000.000000 | 1189.000000 |
8 rows × 61 columns
#Check for null values
df_loan.isnull().sum()
ListingKey 0
ListingNumber 0
ListingCreationDate 0
CreditGrade 84984
Term 0
...
PercentFunded 0
Recommendations 0
InvestmentFromFriendsCount 0
InvestmentFromFriendsAmount 0
Investors 0
Length: 81, dtype: int64
#Check for percentage of missing values in columns
df_loan.isnull().sum()/pr_loan.shape[0] * 100.00
ListingKey 0.000000
ListingNumber 0.000000
ListingCreationDate 0.000000
CreditGrade 74.588588
Term 0.000000
...
PercentFunded 0.000000
Recommendations 0.000000
InvestmentFromFriendsCount 0.000000
InvestmentFromFriendsAmount 0.000000
Investors 0.000000
Length: 81, dtype: float64
#Check for percentage of missing values in columns
def nulls() :
y = pr_loan.isnull().sum()
for i in range(len(y)) :
if y[i] != 0 :
x= y[i]
col = pr_loan.columns[i]
z= (x/pr_loan.shape[0])*100
print(str(i) + " ---> " + col + "/////// " + str(round(z,2)) +" %")
nulls()
3 ---> CreditGrade/////// 74.59 % 6 ---> ClosedDate/////// 51.65 % 7 ---> BorrowerAPR/////// 0.02 % 10 ---> EstimatedEffectiveYield/////// 25.53 % 11 ---> EstimatedLoss/////// 25.53 % 12 ---> EstimatedReturn/////// 25.53 % 13 ---> ProsperRating (numeric)/////// 25.53 % 14 ---> ProsperRating (Alpha)/////// 25.53 % 15 ---> ProsperScore/////// 25.53 % 17 ---> BorrowerState/////// 4.84 % 18 ---> Occupation/////// 3.15 % 19 ---> EmploymentStatus/////// 1.98 % 20 ---> EmploymentStatusDuration/////// 6.69 % 23 ---> GroupKey/////// 88.29 % 25 ---> CreditScoreRangeLower/////// 0.52 % 26 ---> CreditScoreRangeUpper/////// 0.52 % 27 ---> FirstRecordedCreditLine/////// 0.61 % 28 ---> CurrentCreditLines/////// 6.67 % 29 ---> OpenCreditLines/////// 6.67 % 30 ---> TotalCreditLinespast7years/////// 0.61 % 33 ---> InquiriesLast6Months/////// 0.61 % 34 ---> TotalInquiries/////// 1.02 % 35 ---> CurrentDelinquencies/////// 0.61 % 36 ---> AmountDelinquent/////// 6.69 % 37 ---> DelinquenciesLast7Years/////// 0.87 % 38 ---> PublicRecordsLast10Years/////// 0.61 % 39 ---> PublicRecordsLast12Months/////// 6.67 % 40 ---> RevolvingCreditBalance/////// 6.67 % 41 ---> BankcardUtilization/////// 6.67 % 42 ---> AvailableBankcardCredit/////// 6.62 % 43 ---> TotalTrades/////// 6.62 % 44 ---> TradesNeverDelinquent (percentage)/////// 6.62 % 45 ---> TradesOpenedLast6Months/////// 6.62 % 46 ---> DebtToIncomeRatio/////// 7.51 % 51 ---> TotalProsperLoans/////// 80.62 % 52 ---> TotalProsperPaymentsBilled/////// 80.62 % 53 ---> OnTimeProsperPayments/////// 80.62 % 54 ---> ProsperPaymentsLessThanOneMonthLate/////// 80.62 % 55 ---> ProsperPaymentsOneMonthPlusLate/////// 80.62 % 56 ---> ProsperPrincipalBorrowed/////// 80.62 % 57 ---> ProsperPrincipalOutstanding/////// 80.62 % 58 ---> ScorexChangeAtTimeOfListing/////// 83.39 % 60 ---> LoanFirstDefaultedCycleNumber/////// 85.12 %
11 of the columns have more than 50% of missing values. The next move is to drop these 11 columns.
df_loan.drop(columns=['CreditGrade', 'ClosedDate', 'TotalProsperLoans', 'TotalProsperPaymentsBilled',
'OnTimeProsperPayments', 'ProsperPaymentsLessThanOneMonthLate', 'ProsperPaymentsOneMonthPlusLate',
'ProsperPrincipalBorrowed', 'ProsperPrincipalOutstanding', 'ScorexChangeAtTimeOfListing',
'LoanFirstDefaultedCycleNumber'],inplace= True )
#Check the null values again.
nulls()
3 ---> CreditGrade/////// 74.59 % 6 ---> ClosedDate/////// 51.65 % 7 ---> BorrowerAPR/////// 0.02 % 10 ---> EstimatedEffectiveYield/////// 25.53 % 11 ---> EstimatedLoss/////// 25.53 % 12 ---> EstimatedReturn/////// 25.53 % 13 ---> ProsperRating (numeric)/////// 25.53 % 14 ---> ProsperRating (Alpha)/////// 25.53 % 15 ---> ProsperScore/////// 25.53 % 17 ---> BorrowerState/////// 4.84 % 18 ---> Occupation/////// 3.15 % 19 ---> EmploymentStatus/////// 1.98 % 20 ---> EmploymentStatusDuration/////// 6.69 % 23 ---> GroupKey/////// 88.29 % 25 ---> CreditScoreRangeLower/////// 0.52 % 26 ---> CreditScoreRangeUpper/////// 0.52 % 27 ---> FirstRecordedCreditLine/////// 0.61 % 28 ---> CurrentCreditLines/////// 6.67 % 29 ---> OpenCreditLines/////// 6.67 % 30 ---> TotalCreditLinespast7years/////// 0.61 % 33 ---> InquiriesLast6Months/////// 0.61 % 34 ---> TotalInquiries/////// 1.02 % 35 ---> CurrentDelinquencies/////// 0.61 % 36 ---> AmountDelinquent/////// 6.69 % 37 ---> DelinquenciesLast7Years/////// 0.87 % 38 ---> PublicRecordsLast10Years/////// 0.61 % 39 ---> PublicRecordsLast12Months/////// 6.67 % 40 ---> RevolvingCreditBalance/////// 6.67 % 41 ---> BankcardUtilization/////// 6.67 % 42 ---> AvailableBankcardCredit/////// 6.62 % 43 ---> TotalTrades/////// 6.62 % 44 ---> TradesNeverDelinquent (percentage)/////// 6.62 % 45 ---> TradesOpenedLast6Months/////// 6.62 % 46 ---> DebtToIncomeRatio/////// 7.51 % 51 ---> TotalProsperLoans/////// 80.62 % 52 ---> TotalProsperPaymentsBilled/////// 80.62 % 53 ---> OnTimeProsperPayments/////// 80.62 % 54 ---> ProsperPaymentsLessThanOneMonthLate/////// 80.62 % 55 ---> ProsperPaymentsOneMonthPlusLate/////// 80.62 % 56 ---> ProsperPrincipalBorrowed/////// 80.62 % 57 ---> ProsperPrincipalOutstanding/////// 80.62 % 58 ---> ScorexChangeAtTimeOfListing/////// 83.39 % 60 ---> LoanFirstDefaultedCycleNumber/////// 85.12 %
There are several variables in this large data, but we will streamline our searchlight to a few of them.
They are:
'ListingNumber', 'ListingCreationDate', 'LoanOriginalAmount', 'LoanStatus', 'ListingCategory (numeric)', 'BorrowerState', 'BorrowerAPR', 'BorrowerRate', 'StatedMonthlyIncome', 'ProsperRating (Alpha)', 'Occupation', 'Term', 'EmploymentStatus', 'TotalInquiries', 'DebtToIncomeRatio', 'MonthlyLoanPayment', 'TotalTrades', 'Investors'
Information on the selected variables
1. ListingNumber == The number that uniquely identifies the listing to the public as displayed on the website.
2. ListingCreationDate == The date the listing was created.
3. LoanOriginalAmount == The origination amount of the loan.
4. LoanStatus == The current status of the loan: Cancelled, Chargedoff, Completed, Current, Defaulted, FinalPaymentInProgress, PastDue. The PastDue status will be accompanied by a delinquency bucket.
5. ListingCategory (numeric) == The category of the listing that the borrower selected when posting their listing: 0 - Not Available, 1 - Debt Consolidation, 2 - Home Improvement, 3 - Business, 4 - Personal Loan, 5 - Student Use, 6 - Auto, 7- Other, 8 - Baby&Adoption, 9 - Boat, 10 - Cosmetic Procedure, 11 - Engagement Ring, 12 - Green Loans, 13 - Household Expenses, 14 - Large Purchases, 15 - Medical/Dental, 16 - Motorcycle, 17 - RV, 18 - Taxes, 19 - Vacation, 20 - Wedding Loans
6. BorrowerState == The two letter abbreviation of the state of the address of the borrower at the time the Listing was created.
7. BorrowerAPR == The Borrower's Annual Percentage Rate (APR) for the loan.
8. BorrowerRate == The Borrower's interest rate for this loan.
9. StatedMonthlyIncome == The monthly income the borrower stated at the time the listing was created.
10. ProsperRating (Alpha) == The Prosper Rating assigned at the time the listing was created between AA - HR. Applicable for loans originated after July 2009.
11. Occupation == The Occupation selected by the Borrower at the time they created the listing.
12. Term == The length of the loan expressed in months.
13. EmploymentStatus == The employment status of the borrower at the time they posted the listing.
14. TotalInquiries == Total number of inquiries at the time the credit profile was pulled.
15. DebtToIncomeRatio == The debt to income ratio of the borrower at the time the credit profile was pulled. This value is Null if the debt to income ratio is not available. This value is capped at 10.01 (any debt to income ratio larger than 1000% will be returned as 1001%).
16. MonthlyLoanPayment == The scheduled monthly loan payment.
17. TotalTrades == Number of trade lines ever opened at the time the credit profile was pulled.
18. Investors == The number of investors that funded the loan.
#Select the variables into a dataframe
col_sub = ['ListingNumber','ListingCreationDate','LoanOriginalAmount','LoanStatus','ListingCategory (numeric)','BorrowerState','BorrowerAPR','BorrowerRate','StatedMonthlyIncome', 'ProsperRating (Alpha)',
'Occupation','Term','EmploymentStatus','TotalInquiries','DebtToIncomeRatio','MonthlyLoanPayment','TotalTrades','Investors']
loan = df_loan[col_sub]
#Check the properties of the selected loan dataframe
loan.shape
(113937, 18)
loan.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 113937 non-null int64 1 ListingCreationDate 113937 non-null object 2 LoanOriginalAmount 113937 non-null int64 3 LoanStatus 113937 non-null object 4 ListingCategory (numeric) 113937 non-null int64 5 BorrowerState 108422 non-null object 6 BorrowerAPR 113912 non-null float64 7 BorrowerRate 113937 non-null float64 8 StatedMonthlyIncome 113937 non-null float64 9 ProsperRating (Alpha) 84853 non-null object 10 Occupation 110349 non-null object 11 Term 113937 non-null int64 12 EmploymentStatus 111682 non-null object 13 TotalInquiries 112778 non-null float64 14 DebtToIncomeRatio 105383 non-null float64 15 MonthlyLoanPayment 113937 non-null float64 16 TotalTrades 106393 non-null float64 17 Investors 113937 non-null int64 dtypes: float64(7), int64(5), object(6) memory usage: 15.6+ MB
loan.describe()
| ListingNumber | LoanOriginalAmount | ListingCategory (numeric) | BorrowerAPR | BorrowerRate | StatedMonthlyIncome | Term | TotalInquiries | DebtToIncomeRatio | MonthlyLoanPayment | TotalTrades | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.139370e+05 | 113937.00000 | 113937.000000 | 113912.000000 | 113937.000000 | 1.139370e+05 | 113937.000000 | 112778.000000 | 105383.000000 | 113937.000000 | 106393.000000 | 113937.000000 |
| mean | 6.278857e+05 | 8337.01385 | 2.774209 | 0.218828 | 0.192764 | 5.608026e+03 | 40.830248 | 5.584405 | 0.275947 | 272.475783 | 23.230034 | 80.475228 |
| std | 3.280762e+05 | 6245.80058 | 3.996797 | 0.080364 | 0.074818 | 7.478497e+03 | 10.436212 | 6.429946 | 0.551759 | 192.697812 | 11.871311 | 103.239020 |
| min | 4.000000e+00 | 1000.00000 | 0.000000 | 0.006530 | 0.000000 | 0.000000e+00 | 12.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 4.009190e+05 | 4000.00000 | 1.000000 | 0.156290 | 0.134000 | 3.200333e+03 | 36.000000 | 2.000000 | 0.140000 | 131.620000 | 15.000000 | 2.000000 |
| 50% | 6.005540e+05 | 6500.00000 | 1.000000 | 0.209760 | 0.184000 | 4.666667e+03 | 36.000000 | 4.000000 | 0.220000 | 217.740000 | 22.000000 | 44.000000 |
| 75% | 8.926340e+05 | 12000.00000 | 3.000000 | 0.283810 | 0.250000 | 6.825000e+03 | 36.000000 | 7.000000 | 0.320000 | 371.580000 | 30.000000 | 115.000000 |
| max | 1.255725e+06 | 35000.00000 | 20.000000 | 0.512290 | 0.497500 | 1.750003e+06 | 60.000000 | 379.000000 | 10.010000 | 2251.510000 | 126.000000 | 1189.000000 |
From the properties, further wrangling needs to be done.
ProsperRating (Alpha) is an integral variable for analysis and it has 25% null values. That is a significant number compared to other variables that do not have up to 10% null values. In order to effectively work with it, only rows where ProsperRating (Alpha) is not null will be selected.
#Select rows where ProsperRating (Alpha) is not null
loan = loan[loan['ProsperRating (Alpha)'].notnull()]
#Check the properties
loan.shape
(84853, 18)
loan.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 84853 entries, 1 to 113936 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 84853 non-null int64 1 ListingCreationDate 84853 non-null object 2 LoanOriginalAmount 84853 non-null int64 3 LoanStatus 84853 non-null object 4 ListingCategory (numeric) 84853 non-null int64 5 BorrowerState 84853 non-null object 6 BorrowerAPR 84853 non-null float64 7 BorrowerRate 84853 non-null float64 8 StatedMonthlyIncome 84853 non-null float64 9 ProsperRating (Alpha) 84853 non-null object 10 Occupation 83520 non-null object 11 Term 84853 non-null int64 12 EmploymentStatus 84853 non-null object 13 TotalInquiries 84853 non-null float64 14 DebtToIncomeRatio 77557 non-null float64 15 MonthlyLoanPayment 84853 non-null float64 16 TotalTrades 84853 non-null float64 17 Investors 84853 non-null int64 dtypes: float64(7), int64(5), object(6) memory usage: 12.3+ MB
There are still missing values for "Occupation" & "DebtToIncomeRatio".
To deal with these missing values, replace the null values in "Occupation" with "Unknown".
While for "DebtToIncomeRatio", replace with the mean value.
#Drop Duplicates first
loan = loan.drop_duplicates()
#Relace missing values for "Occupation" with "Unknown"
loan.Occupation = loan.Occupation.fillna('Unknown')
#Relace missing values for "DebtToIncomeRatio" with mean value
loan.DebtToIncomeRatio = loan.DebtToIncomeRatio.fillna(loan.DebtToIncomeRatio.mean())
loan.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 83982 entries, 1 to 113936 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 83982 non-null int64 1 ListingCreationDate 83982 non-null object 2 LoanOriginalAmount 83982 non-null int64 3 LoanStatus 83982 non-null object 4 ListingCategory (numeric) 83982 non-null int64 5 BorrowerState 83982 non-null object 6 BorrowerAPR 83982 non-null float64 7 BorrowerRate 83982 non-null float64 8 StatedMonthlyIncome 83982 non-null float64 9 ProsperRating (Alpha) 83982 non-null object 10 Occupation 83982 non-null object 11 Term 83982 non-null int64 12 EmploymentStatus 83982 non-null object 13 TotalInquiries 83982 non-null float64 14 DebtToIncomeRatio 83982 non-null float64 15 MonthlyLoanPayment 83982 non-null float64 16 TotalTrades 83982 non-null float64 17 Investors 83982 non-null int64 dtypes: float64(7), int64(5), object(6) memory usage: 12.2+ MB
The data type of "TotalInquiries" & "TotalTrades" need to be changed from float to int.
#Convert the datatypes from float to int
loan.TotalInquiries = loan.TotalInquiries.astype(int)
loan.TotalTrades = loan.TotalTrades.astype(int)
#Test
loan.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 83982 entries, 1 to 113936 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 83982 non-null int64 1 ListingCreationDate 83982 non-null object 2 LoanOriginalAmount 83982 non-null int64 3 LoanStatus 83982 non-null object 4 ListingCategory (numeric) 83982 non-null int64 5 BorrowerState 83982 non-null object 6 BorrowerAPR 83982 non-null float64 7 BorrowerRate 83982 non-null float64 8 StatedMonthlyIncome 83982 non-null float64 9 ProsperRating (Alpha) 83982 non-null object 10 Occupation 83982 non-null object 11 Term 83982 non-null int64 12 EmploymentStatus 83982 non-null object 13 TotalInquiries 83982 non-null int64 14 DebtToIncomeRatio 83982 non-null float64 15 MonthlyLoanPayment 83982 non-null float64 16 TotalTrades 83982 non-null int64 17 Investors 83982 non-null int64 dtypes: float64(5), int64(7), object(6) memory usage: 12.2+ MB
The "ListingCreationDate" also needs dissecting. The column can be split into "Day", "Month", "Year"
#Splitting "ListingCreationDate"
#First split into year & month
loan['Year'] = loan['ListingCreationDate'].apply(lambda x: x.split("-")[0]).astype(str)
loan['Month'] = loan['ListingCreationDate'].apply(lambda x: x.split("-")[1]).astype(str)
#Split into "Date_and_Time" first in order to easily extract the day.
loan['Day_and_Time'] = loan['ListingCreationDate'].apply(lambda x: x.split("-")[2]).astype(str)
#Split into Day & Time from 'Day_and_Time'
loan['Day'] = loan['Day_and_Time'].apply(lambda x: x.split(" ")[0]).astype(str)
loan['Time'] = loan['Day_and_Time'].apply(lambda x: x.split(" ")[1]).astype(str)
#Test
loan.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 83982 entries, 1 to 113936 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 83982 non-null int64 1 ListingCreationDate 83982 non-null object 2 LoanOriginalAmount 83982 non-null int64 3 LoanStatus 83982 non-null object 4 ListingCategory (numeric) 83982 non-null int64 5 BorrowerState 83982 non-null object 6 BorrowerAPR 83982 non-null float64 7 BorrowerRate 83982 non-null float64 8 StatedMonthlyIncome 83982 non-null float64 9 ProsperRating (Alpha) 83982 non-null object 10 Occupation 83982 non-null object 11 Term 83982 non-null int64 12 EmploymentStatus 83982 non-null object 13 TotalInquiries 83982 non-null int64 14 DebtToIncomeRatio 83982 non-null float64 15 MonthlyLoanPayment 83982 non-null float64 16 TotalTrades 83982 non-null int64 17 Investors 83982 non-null int64 18 Year 83982 non-null object 19 Month 83982 non-null object 20 Day_and_Time 83982 non-null object 21 Day 83982 non-null object 22 Time 83982 non-null object dtypes: float64(5), int64(7), object(11) memory usage: 15.4+ MB
#Drop the Day_and_Time & ListingCreationDate columns
loan.drop(columns=['Day_and_Time','ListingCreationDate'], inplace=True, axis=1)
loan.head()
| ListingNumber | LoanOriginalAmount | LoanStatus | ListingCategory (numeric) | BorrowerState | BorrowerAPR | BorrowerRate | StatedMonthlyIncome | ProsperRating (Alpha) | Occupation | ... | EmploymentStatus | TotalInquiries | DebtToIncomeRatio | MonthlyLoanPayment | TotalTrades | Investors | Year | Month | Day | Time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1209647 | 10000 | Current | 2 | CO | 0.12016 | 0.0920 | 6125.000000 | A | Professional | ... | Employed | 5 | 0.18 | 318.93 | 29 | 1 | 2014 | 02 | 27 | 08:28:07.900000000 |
| 3 | 658116 | 10000 | Current | 16 | GA | 0.12528 | 0.0974 | 2875.000000 | A | Skilled Labor | ... | Employed | 1 | 0.15 | 321.45 | 26 | 158 | 2012 | 10 | 22 | 11:02:35.010000000 |
| 4 | 909464 | 15000 | Current | 2 | MN | 0.24614 | 0.2085 | 9583.333333 | D | Executive | ... | Employed | 9 | 0.26 | 563.97 | 39 | 20 | 2013 | 09 | 14 | 18:38:39.097000000 |
| 5 | 1074836 | 15000 | Current | 1 | NM | 0.15425 | 0.1314 | 8333.333333 | B | Professional | ... | Employed | 2 | 0.36 | 342.37 | 47 | 1 | 2013 | 12 | 14 | 08:26:37.093000000 |
| 6 | 750899 | 3000 | Current | 1 | KS | 0.31032 | 0.2712 | 2083.333333 | E | Sales - Retail | ... | Employed | 0 | 0.27 | 122.67 | 16 | 1 | 2013 | 04 | 12 | 09:52:56.147000000 |
5 rows × 21 columns
Change the values for 'Month' from numbers to names.
#Replacing numerical values of month with names
loan.Month.replace(['01','02','03','04','05','06','07','08','09','10','11','12'],
['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sept','Oct','Nov','Dec'],inplace = True)
#Test
loan['Month'].unique()
array(['Feb', 'Oct', 'Sept', 'Dec', 'Apr', 'May', 'Jul', 'Aug', 'Nov',
'Jan', 'Jun', 'Mar'], dtype=object)
#Test
loan.sample(10)
| ListingNumber | LoanOriginalAmount | LoanStatus | ListingCategory (numeric) | BorrowerState | BorrowerAPR | BorrowerRate | StatedMonthlyIncome | ProsperRating (Alpha) | Occupation | ... | EmploymentStatus | TotalInquiries | DebtToIncomeRatio | MonthlyLoanPayment | TotalTrades | Investors | Year | Month | Day | Time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 105425 | 688663 | 15000 | Chargedoff | 3 | FL | 0.24764 | 0.2226 | 4091.333333 | C | Medical Technician | ... | Employed | 9 | 0.258692 | 416.50 | 23 | 210 | 2012 | Dec | 18 | 16:18:12.873000000 |
| 64469 | 547311 | 4000 | Current | 3 | UT | 0.34731 | 0.3073 | 4575.500000 | E | Teacher | ... | Employed | 19 | 0.160000 | 171.41 | 20 | 33 | 2011 | Dec | 28 | 14:09:41.570000000 |
| 38987 | 529453 | 4000 | Completed | 1 | MD | 0.35643 | 0.3199 | 8000.000000 | HR | Other | ... | Employed | 5 | 0.258692 | 174.20 | 43 | 40 | 2011 | Sept | 26 | 05:15:29.960000000 |
| 3802 | 990925 | 9000 | Current | 1 | GA | 0.09000 | 0.0766 | 4583.333333 | AA | Teacher | ... | Employed | 3 | 0.270000 | 280.62 | 38 | 49 | 2013 | Nov | 12 | 18:12:35.560000000 |
| 105514 | 787525 | 3500 | Current | 16 | CA | 0.26528 | 0.2272 | 7250.000000 | D | Civil Service | ... | Employed | 5 | 0.040000 | 134.97 | 31 | 54 | 2013 | May | 23 | 12:43:53.703000000 |
| 29025 | 732140 | 18988 | Current | 1 | CA | 0.11066 | 0.0735 | 15000.000000 | A | Other | ... | Employed | 15 | 0.270000 | 1646.00 | 25 | 293 | 2013 | Mar | 18 | 18:46:27.197000000 |
| 6638 | 515832 | 11365 | Current | 1 | NV | 0.20436 | 0.1899 | 7333.333333 | B | Engineer - Electrical | ... | Employed | 5 | 0.280000 | 294.74 | 49 | 171 | 2011 | Jul | 10 | 20:30:01.847000000 |
| 102415 | 562781 | 4000 | Chargedoff | 1 | MI | 0.28370 | 0.2452 | 1833.333333 | D | Landscaping | ... | Self-employed | 0 | 0.258692 | 158.03 | 8 | 1 | 2012 | Feb | 26 | 13:51:17.187000000 |
| 108237 | 662626 | 16000 | Current | 2 | NC | 0.15752 | 0.1346 | 5391.583333 | A | Other | ... | Employed | 3 | 0.340000 | 367.83 | 19 | 135 | 2012 | Oct | 30 | 05:02:54.340000000 |
| 15449 | 435574 | 2500 | Completed | 1 | MO | 0.12106 | 0.1000 | 2470.833333 | A | Teacher | ... | Full-time | 2 | 0.180000 | 80.67 | 4 | 139 | 2009 | Nov | 27 | 09:48:17.967000000 |
10 rows × 21 columns
loan.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 83982 entries, 1 to 113936 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 83982 non-null int64 1 LoanOriginalAmount 83982 non-null int64 2 LoanStatus 83982 non-null object 3 ListingCategory (numeric) 83982 non-null int64 4 BorrowerState 83982 non-null object 5 BorrowerAPR 83982 non-null float64 6 BorrowerRate 83982 non-null float64 7 StatedMonthlyIncome 83982 non-null float64 8 ProsperRating (Alpha) 83982 non-null object 9 Occupation 83982 non-null object 10 Term 83982 non-null int64 11 EmploymentStatus 83982 non-null object 12 TotalInquiries 83982 non-null int64 13 DebtToIncomeRatio 83982 non-null float64 14 MonthlyLoanPayment 83982 non-null float64 15 TotalTrades 83982 non-null int64 16 Investors 83982 non-null int64 17 Year 83982 non-null object 18 Month 83982 non-null object 19 Day 83982 non-null object 20 Time 83982 non-null object dtypes: float64(5), int64(7), object(9) memory usage: 14.1+ MB
The "ListingCategory (numeric)" needs to be adjusted and renamed.
#Creating a new column if they meet the category conditions
#Create an empty list
category= []
#Creat a loop which will iterate over all the rows in column 'listingCategory' and assign values in the list if the condition is meet
for row in loan['ListingCategory (numeric)']:
if row == 0: category.append('Not available')
if row == 1: category.append('Debt Consolidation')
if row == 2: category.append('Home Improvement')
if row == 3: category.append('Business')
if row == 4: category.append('Personal Loan')
if row == 5: category.append('Student Use')
if row == 6: category.append('Auto')
if row == 7: category.append('Other')
if row == 8: category.append('Baby&Adoption')
if row == 9: category.append('Boat')
if row == 10: category.append('Cosmetic Procedure')
if row == 11: category.append('Engagement Ring')
if row == 12: category.append('Green Loans')
if row == 13: category.append('Household Expenses')
if row == 14: category.append('Large Purchases')
if row == 15: category.append('Medical/Dental')
if row == 16: category.append('Motorcycle')
if row == 17: category.append('RV')
if row == 18: category.append('Taxes')
if row == 19: category.append('Vacation')
if row == 20: category.append('Wedding Loans')
# store the append values into a new column in the loan data frame
loan['Listing_Category'] = category
#Drop the previous column
loan = loan.drop('ListingCategory (numeric)',axis=1)
#Test
loan.sample(10)
| ListingNumber | LoanOriginalAmount | LoanStatus | BorrowerState | BorrowerAPR | BorrowerRate | StatedMonthlyIncome | ProsperRating (Alpha) | Occupation | Term | ... | TotalInquiries | DebtToIncomeRatio | MonthlyLoanPayment | TotalTrades | Investors | Year | Month | Day | Time | Listing_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11146 | 1209806 | 25000 | Current | AK | 0.14446 | 0.1159 | 7500.000000 | A | Civil Service | 36 | ... | 5 | 0.27 | 825.47 | 26 | 1 | 2014 | Feb | 27 | 08:50:30.870000000 | Debt Consolidation |
| 70440 | 562022 | 25000 | Current | TX | 0.15784 | 0.1396 | 10416.666667 | A | Professional | 60 | ... | 4 | 0.16 | 581.19 | 34 | 19 | 2012 | Feb | 23 | 17:35:28.787000000 | Home Improvement |
| 95266 | 549185 | 7000 | Completed | NE | 0.07830 | 0.0749 | 5979.000000 | AA | Computer Programmer | 36 | ... | 5 | 0.10 | 217.71 | 11 | 151 | 2012 | Jan | 05 | 11:07:00.157000000 | Auto |
| 24859 | 989931 | 10000 | Current | NC | 0.18555 | 0.1620 | 2596.000000 | B | Other | 60 | ... | 8 | 0.38 | 244.24 | 11 | 1 | 2013 | Oct | 20 | 12:23:53.733000000 | Business |
| 85196 | 1235658 | 4000 | Current | PA | 0.21636 | 0.1790 | 2221.416667 | C | Teacher | 36 | ... | 9 | 0.30 | 144.41 | 25 | 28 | 2014 | Feb | 26 | 08:19:08.703000000 | Debt Consolidation |
| 46519 | 1097486 | 25000 | Current | AZ | 0.20217 | 0.1655 | 11250.000000 | C | Doctor | 36 | ... | 9 | 0.27 | 885.73 | 24 | 1 | 2013 | Dec | 26 | 15:35:41.463000000 | Debt Consolidation |
| 43331 | 625358 | 5000 | Current | MN | 0.17754 | 0.1414 | 13333.333333 | B | Other | 36 | ... | 5 | 0.19 | 171.23 | 32 | 90 | 2012 | Aug | 17 | 04:54:07.590000000 | Home Improvement |
| 108387 | 481775 | 7000 | Completed | MD | 0.29265 | 0.2575 | 11076.166667 | D | Professional | 36 | ... | 13 | 0.13 | 260.14 | 41 | 172 | 2010 | Oct | 27 | 11:05:13.883000000 | Debt Consolidation |
| 27903 | 450136 | 1500 | Chargedoff | SC | 0.22390 | 0.1990 | 3333.333333 | C | Engineer - Electrical | 36 | ... | 1 | 0.16 | 55.67 | 27 | 38 | 2010 | Mar | 11 | 17:58:17.263000000 | Debt Consolidation |
| 100374 | 744914 | 15000 | Current | PA | 0.26877 | 0.2432 | 4166.666667 | D | Skilled Labor | 60 | ... | 2 | 0.25 | 434.31 | 27 | 192 | 2013 | Apr | 07 | 07:27:58.317000000 | Debt Consolidation |
10 rows × 21 columns
loan.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 83982 entries, 1 to 113936 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingNumber 83982 non-null int64 1 LoanOriginalAmount 83982 non-null int64 2 LoanStatus 83982 non-null object 3 BorrowerState 83982 non-null object 4 BorrowerAPR 83982 non-null float64 5 BorrowerRate 83982 non-null float64 6 StatedMonthlyIncome 83982 non-null float64 7 ProsperRating (Alpha) 83982 non-null object 8 Occupation 83982 non-null object 9 Term 83982 non-null int64 10 EmploymentStatus 83982 non-null object 11 TotalInquiries 83982 non-null int64 12 DebtToIncomeRatio 83982 non-null float64 13 MonthlyLoanPayment 83982 non-null float64 14 TotalTrades 83982 non-null int64 15 Investors 83982 non-null int64 16 Year 83982 non-null object 17 Month 83982 non-null object 18 Day 83982 non-null object 19 Time 83982 non-null object 20 Listing_Category 83982 non-null object dtypes: float64(5), int64(6), object(10) memory usage: 14.1+ MB
loan.shape
(83982, 21)
The working dataset of contains 83982 loans with 21 features was sieved out from 113937 loans and 81 features
The main feature(s) include the Borrower's Annual Percentage Rate (APR), Borrower Rate, & Prosper Rating
Other features I expect to help support are: Occupation, Term, Employment Status & Debt To Income Ratio
#Create a function that will plot the histogram and distribution for different variables.
def plothist(info):
fig = px.histogram(data_frame=loan, x=info, nbins=20,
labels={info: info.upper()},
title = 'HISTOGRAM SHOWING ' + info.upper())
fig.show()
plothist('Year')
plothist('Month')
plothist('Day')
The distribution for the Year indicates that most of the loans were collected in 2013. I don't have enough information to give a suggestion to why 2013 experienced an overwhelming number of loan collection.
The distribution for the Month shows that January, October, & December are the three most active months for loan collection. These months are either toward the end of a year or the beginning of a new year. It can be suggested that people like to borrow loans in those periods to prepare for the festive season as well as prepare for what the new year holds in terms of personal projects, school fees, & other expenses.
#Now, check for the distribution for our main feature, Borrower's Annual Percentage Rate (APR)
fig = px.histogram(data_frame=loan, x='BorrowerAPR', nbins=20,
labels={'BorrowerAPR': 'Borrower\'s Annual Percentage Rate (APR)'},
title = 'DISTRIBUTION OF ' + 'Borrower\'s Annual Percentage Rate (APR)'.upper())
fig.show()
The distribution indicates that it is multimodal. Across the distribution, we see about four different peak periods. One is between 0.08 & 0.09. The 2nd is between 0.2 & 0.22. The third is between 0.28 & 0.29. And the last one, with the most counts, is between 0.34 & 0.36. The distribution mainly spans between 0.1 & 0.4
#Let's check for the distribution for Borrower Rate
fig = px.histogram(data_frame=loan, x='BorrowerRate', nbins=20,
labels={'BorrowerRate': 'Borrower\'s Rate'},
title = 'DISTRIBUTION OF ' + 'Borrower\'s Rate'.upper())
fig.show()
The distribution for the Borrower's Rate is almost identical to that of Borrower's Annual Percentage Rate (APR). They share the similar pattern of having multiple modes within the same range.
#Let's see the distribution for Monthly Income
fig = px.histogram(data_frame=loan, x='StatedMonthlyIncome',
labels={'StatedMonthlyIncome': 'Stated Monthly Income'},
title = 'HISTOGRAM SHOWING STATED MONTHLY INCOME')
fig.update_traces(xbins=dict( # bins used for histogram
start=0.0,
end=50000.0,
size=500
))
fig.show()
The distribution is skewed to the right between 0 and 15,000. This indicates the income range for most of the borrowers.
#Let's see the distribution for Debt to Income Ratio
fig = px.histogram(data_frame=loan, x='DebtToIncomeRatio',
labels={'DebtToIncomeRatio': 'Debt To Income Ratio'},
title = 'DISTRIBUTION OF DEBT TO INCOME RATIO')
fig.update_traces(xbins=dict( # bins used for histogram
start=0.0,
end=2.0,
size=0.05
))
fig.show()
The distribution shows that the Debt to Income ratio is skewed to the right and it's between 0 & 1. It also has peak that's between 0.25 & 0.29. It indicates that most borrowers prefer a ratio between 0.25 & 0.29.
#Let's check for the distribution for Loan Original Amount
fig = px.histogram(data_frame=loan, x='LoanOriginalAmount', nbins=30,
labels={'LoanOriginalAmount': 'Loan Original Amount'},
title = 'DISTRIBUTION OF ' + 'Loan Original Amount'.upper())
fig.show()
In the distribution above, we see multiple modes. The most loan amount is between 4,000 & 6,000. The multiple mode situation suggests that people like to borrow in the multiples of 5,000
# Lets's see the distribution for Employee Status & Prosper Ratings
# We will use a pie chart
fig = px.pie(data_frame=loan, names='EmploymentStatus',
labels={'EmploymentStatus': 'Employment Status'},
title = 'EMPLOYMENT STATUS')
fig.show()
From the pie chart, we see that almost 80% of borrowers are employes which suggests that being employed is significant to getting a loan.
fig = px.pie(data_frame=loan, names='ProsperRating (Alpha)',
labels={'ProsperRating (Alpha)': 'Prosper Rating'},
title = 'Prosper Rating')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
From the above, we see that most of the people fall within ratings E to A. The remaining percentages are either people with the lowest credit risk (AA) or the highest credit risk (HR)
#Let's check out why peple take loans
loan_reason = loan['Listing_Category'].value_counts().rename_axis('Listing_Category').reset_index(name='Counts')
loan_reason
| Listing_Category | Counts | |
|---|---|---|
| 0 | Debt Consolidation | 52496 |
| 1 | Other | 9172 |
| 2 | Home Improvement | 6756 |
| 3 | Business | 5266 |
| 4 | Auto | 2233 |
| 5 | Household Expenses | 1987 |
| 6 | Medical/Dental | 1507 |
| 7 | Taxes | 882 |
| 8 | Large Purchases | 863 |
| 9 | Vacation | 764 |
| 10 | Wedding Loans | 762 |
| 11 | Motorcycle | 304 |
| 12 | Student Use | 274 |
| 13 | Engagement Ring | 214 |
| 14 | Baby&Adoption | 196 |
| 15 | Cosmetic Procedure | 91 |
| 16 | Boat | 85 |
| 17 | Green Loans | 58 |
| 18 | RV | 52 |
| 19 | Not available | 20 |
# Let's visualize it
fig = px.bar(loan_reason, x="Listing_Category", y="Counts", height=1000,
labels={'Listing_Category':'Reasons for Borrowing', 'Counts':'Number of times featured'},
title = 'THE REASONS WHY PEOPLE TAKE PROSPER LOANS', width = 1000, text = 'Counts',
color_discrete_sequence =['palegreen']*len(loan_reason))
fig.show()
From the chart, most people borrow because they want to consolidate debts. Borrowing more to consolidate debts doesn't sound healthy.
#Let's see the Top 20 occupations of the borrowers.
top_occupation = loan['Occupation'].value_counts().rename_axis('Occupation').reset_index(name='Counts').head(20)
top_occupation
| Occupation | Counts | |
|---|---|---|
| 0 | Other | 21122 |
| 1 | Professional | 10445 |
| 2 | Executive | 3437 |
| 3 | Computer Programmer | 3200 |
| 4 | Teacher | 2858 |
| 5 | Analyst | 2711 |
| 6 | Administrative Assistant | 2697 |
| 7 | Accountant/CPA | 2550 |
| 8 | Sales - Commission | 2318 |
| 9 | Skilled Labor | 2157 |
| 10 | Nurse (RN) | 2145 |
| 11 | Clerical | 2087 |
| 12 | Sales - Retail | 2012 |
| 13 | Retail Management | 1978 |
| 14 | Truck Driver | 1357 |
| 15 | Construction | 1313 |
| 16 | Unknown | 1274 |
| 17 | Police Officer/Correction Officer | 1260 |
| 18 | Laborer | 1200 |
| 19 | Civil Service | 1127 |
# Let's visualize it
fig = px.bar(top_occupation, x="Occupation", y="Counts", height=1000,
labels={'Occupation':'Occupation', 'Counts':'Number of times featured'},
title = 'TOP 20 OCCUPATIONS THAT RECEIVE LOANS', width = 1000, text = 'Counts',
color_discrete_sequence =['magenta']*len(top_occupation))
fig.show()
From the chart, most people did not state their occupation. 2nd to that group of people are those who are professionals.
The distribution for the Borrower's Rate is almost identical to that of Borrower's Annual Percentage Rate (APR). They share the similar pattern of having multiple modes within the same range.
The distributions for the Debt to Monthly Ratio & Stated Montly Income were skewed to the right.
80% of borrowers are employed. It suggests that employment status is one of the criteria for loan allocation.
There no unusual distributions and I did not perform any further operation to tidy, adjust, or change the form of the data.
#Let's check for the correlation between different variables in the data.
#This correlation test can take multiple formats.
#Using Heatmap for the correlation.
df_corr = loan.corr()
df_corr
| ListingNumber | LoanOriginalAmount | BorrowerAPR | BorrowerRate | StatedMonthlyIncome | Term | TotalInquiries | DebtToIncomeRatio | MonthlyLoanPayment | TotalTrades | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| ListingNumber | 1.000000 | 0.339924 | -0.306740 | -0.322090 | 0.035377 | 0.190848 | -0.001702 | 0.012404 | 0.278137 | 0.086464 | -0.297524 |
| LoanOriginalAmount | 0.339924 | 1.000000 | -0.426520 | -0.413597 | 0.182135 | 0.340189 | -0.034931 | -0.017469 | 0.916153 | 0.165978 | 0.321658 |
| BorrowerAPR | -0.306740 | -0.426520 | 1.000000 | 0.993320 | -0.093773 | -0.068346 | 0.176019 | 0.122035 | -0.321105 | -0.073487 | -0.267396 |
| BorrowerRate | -0.322090 | -0.413597 | 0.993320 | 1.000000 | -0.093338 | -0.000336 | 0.175312 | 0.119732 | -0.330978 | -0.071315 | -0.244684 |
| StatedMonthlyIncome | 0.035377 | 0.182135 | -0.093773 | -0.093338 | 1.000000 | 0.009313 | 0.077871 | -0.082674 | 0.182273 | 0.140539 | 0.070522 |
| Term | 0.190848 | 0.340189 | -0.068346 | -0.000336 | 0.009313 | 1.000000 | -0.019424 | -0.000120 | 0.055873 | 0.060759 | 0.012824 |
| TotalInquiries | -0.001702 | -0.034931 | 0.176019 | 0.175312 | 0.077871 | -0.019424 | 1.000000 | -0.024974 | -0.010418 | 0.215279 | -0.048337 |
| DebtToIncomeRatio | 0.012404 | -0.017469 | 0.122035 | 0.119732 | -0.082674 | -0.000120 | -0.024974 | 1.000000 | 0.000907 | 0.075854 | -0.036931 |
| MonthlyLoanPayment | 0.278137 | 0.916153 | -0.321105 | -0.330978 | 0.182273 | 0.055873 | -0.010418 | 0.000907 | 1.000000 | 0.150571 | 0.307513 |
| TotalTrades | 0.086464 | 0.165978 | -0.073487 | -0.071315 | 0.140539 | 0.060759 | 0.215279 | 0.075854 | 0.150571 | 1.000000 | 0.032357 |
| Investors | -0.297524 | 0.321658 | -0.267396 | -0.244684 | 0.070522 | 0.012824 | -0.048337 | -0.036931 | 0.307513 | 0.032357 | 1.000000 |
fig = go.Figure()
fig.add_trace(
go.Heatmap(
x = df_corr.columns,
y = df_corr.index,
z = np.array(df_corr),
colorscale='YlOrRd'
)
)
#Further heatmap check. This time with annotation.
x = list(df_corr.columns)
y = list(df_corr.index)
z = np.array(df_corr)
fig = ff.create_annotated_heatmap(
z,
x = x,
y = y ,
annotation_text = np.around(z, decimals=2),
hoverinfo='z',
colorscale='YlOrRd'
)
fig.show()
#Using pair grid
numeric_vars = ['LoanOriginalAmount', 'BorrowerAPR', 'BorrowerRate', 'StatedMonthlyIncome','DebtToIncomeRatio']
categoric_vars = ['Term','ProsperRating (Alpha)', 'EmploymentStatus','Occupation']
g = sb.PairGrid(data = loan, vars = numeric_vars)
g = g.map_diag(plt.hist, bins = 20);
g.map_offdiag(plt.scatter)
<seaborn.axisgrid.PairGrid at 0x11cc716a0>
From the heatmap and the gridplot, we see that only a few variables are strongly correlated. Examples of varibles with good correlation are Borrower APR & Borrower Rate, Loan Original Amount & Monthly Loan Payment.
#Further check on the interaction between Borrower APR & Borrower Rate.
from pylab import rcParams
rcParams['figure.figsize'] = 15, 7
fig = sb.kdeplot(loan['BorrowerRate'], shade=True, color="b")
fig = sb.kdeplot(loan['BorrowerAPR'], shade=True, color="r")
fig.set_xlabel("BorrowerRate VS BorrowerAPR", fontsize=14)
plt.title('THE DISTRIBUTION PLOT SHOWING THE BORROWER RATE VS THE BORROWER APR\n')
Text(0.5, 1.0, 'THE DISTRIBUTION PLOT SHOWING THE BORROWER RATE VS THE BORROWER APR\n')
The plot above further solidifies that the distribution between Borrower Rate & Borrower APR are similar to each other.
#Let's check the relationship between the Borrower APR & Prosper Rating
fig = px.box(loan, y="BorrowerAPR", x="ProsperRating (Alpha)", height=1000,
labels={'BorrowerAPR':'Borrower APR', 'ProsperRating (Alpha)':'Prosper Rating'},
title = 'BORROWER APR vs PROSPER RATING', width = 1000,
color_discrete_sequence =['tomato']*len(loan))
fig.show()
We see a linear relationship between the Borrower APR & the Prosper Ratings. The lower the Borrower's APR, the better the Prosper Ratings.
#Let's check the relationship between the Loan Original Amount & Term
fig = px.box(loan, y="LoanOriginalAmount", x="Term", height=1000,
labels={'LoanOriginalAmount':'Loan Original Amount', 'Term':'Terms'},
title = 'LOAN ORIGINAL AMOUNT vs TERM', width = 1000,
color_discrete_sequence =['mediumorchid']*len(loan))
fig.show()
There's a linear relationship between the Original Loan Amount and the tenure of the loan. The longer the term, the higher the amount borrowed.
#Let's check the relationship between the Stated Monthly Income & Employee Status
fig = px.box(loan, y="LoanOriginalAmount", x="EmploymentStatus", height=1000,
labels={'LoanOriginalAmount':'Loan Original Amount', 'EmploymentStatus':'Employee Status'},
title = 'LOAN ORIGINAL AMOUNT vs EMPLOYEE STATUS', width = 1000,
color_discrete_sequence =['olivedrab']*len(loan))
fig.show()
The above image shows a significant correlation between the Original Loan Amount & the employee status. The employed people enjoy more original amout. It further solidifies the earlier assertion that the employee status is a key criteria to loan disbursement.
#Let's see the relationships between three categorical features.
plt.figure(figsize = [10, 12])
# subplot 1: Prosper rating vs term
plt.subplot(3, 1, 1)
sb.countplot(data = loan, x = 'ProsperRating (Alpha)', hue = 'Term', palette = 'Blues')
# subplot 2: employment status vs. term
ax = plt.subplot(3, 1, 2)
sb.countplot(data = loan, x = 'EmploymentStatus', hue = 'Term', palette = 'Blues')
plt.xticks(rotation=10)
# subplot 3: Prosper rating vs. employment status, use different color palette
ax = plt.subplot(3, 1, 3)
sb.countplot(data = loan, x = 'EmploymentStatus', hue = 'ProsperRating (Alpha)', palette = 'Greens')
ax.legend(loc = 1, ncol = 2); # re-arrange legend to remove overlapping
plt.xticks(rotation=10);
From the interactions above, we see that most people like to take loans of 3 years and above. Also, those under the "HR" rating only took three years term loans.
Further comparison was done between Borrower APR & Borrower Rate that confirmed the similarities in distribution between the two variables. The Borrower APR & Borrower Rate are strongly correlated.
There's a linear relationship between the Borrower APR & the Prosper Ratings. The lower the Borrower's APR, the better the Prosper Ratings.
Most people like to take loans of 3 years and above. Also, those under the "HR" rating only took three years term loans.
There's a significant correlation between the Original Loan Amount & the employee status. The employed people enjoy more original amout. It further solidifies the earlier assertion that the employee status is a key criteria to loan disbursement.
There's a linear relationship between the Original Loan Amount and the tenure of the loan. The longer the term, the higher the amount borrowed.
There's a strong correlation between Loan Original Amount & Monthly Loan Payment.
# Month effect on relationship of APR and loan amount
g=sb.FacetGrid(data= loan, aspect=1.2, height=5, col='Month', col_wrap=4)
g.map(sb.regplot, 'LoanOriginalAmount', 'BorrowerAPR', x_jitter=0.04, scatter_kws={'alpha':0.1});
g.add_legend();
Month does not appear to have an impact on the Loan Amount & Borrower APR
# Year effect on relationship of APR and loan amount
g=sb.FacetGrid(data= loan, aspect=1.2, height=5, col='Year', col_wrap=4)
g.map(sb.regplot, 'LoanOriginalAmount', 'BorrowerAPR', x_jitter=0.04, scatter_kws={'alpha':0.1});
g.add_legend();
Year also does not appear to have any major impact on the Loan Amount & Borrower APR
# Prosper rating effect on relationship of APR and loan amount
g=sb.FacetGrid(data=loan, aspect=1.2, height=5, col='ProsperRating (Alpha)', col_wrap=4)
g.map(sb.regplot, 'LoanOriginalAmount', 'BorrowerAPR', x_jitter=0.04, scatter_kws={'alpha':0.1});
g.add_legend();
There's an increase in loan amount as the rating gets better. There's also reduced Borrower APR as the rating gets better.
#Let's check the rating and term effects on stated monthly income and loan original amount variables
rate_order = ['A', 'E', 'C', 'AA', 'B', 'D', 'HR']
ordered_var = pd.api.types.CategoricalDtype(ordered = True,
categories = rate_order)
loan['ProsperRating (Alpha)'] = loan['ProsperRating (Alpha)'].astype(ordered_var)
fig, ax = plt.subplots(ncols=2, figsize=[12,6])
sb.pointplot(data = loan, x = 'ProsperRating (Alpha)', y = 'StatedMonthlyIncome', hue = 'Term',
palette = 'Purples', linestyles = '', dodge = 0.4, ax=ax[0])
sb.pointplot(data = loan, x = 'ProsperRating (Alpha)', y = 'LoanOriginalAmount', hue = 'Term',
palette = 'Greens', linestyles = '', dodge = 0.4, ax=ax[1]);
We can see that with better Prosper rating, the loan amount of all three terms increases, the increase amplitude of loan amount between terms also becomes larger.
There's an increase in loan amount as the rating gets better. There's also reduced Borrower APR as the rating gets better.
Month does not appear to have an impact on the Loan Amount & Borrower APR
Year also does not appear to have any major impact on the Loan Amount & Borrower APR
With better Prosper rating, the loan amount of all three terms increases, the increase amplitude of loan amount between terms also becomes larger.
The distribution for the Borrower's Rate is almost identical to that of Borrower's Annual Percentage Rate (APR). They share the similar pattern of having multiple modes within the same range. Both variables are strongly correlated.
Most people like to take loans of 3 years and above. Also, those under the "HR" rating only took three years term loans.
There's a significant correlation between the Original Loan Amount & the employee status. The employed people enjoy more original amout. It further solidifies the earlier assertion that the employee status is a key criteria to loan disbursement.
Almost 80% of borrowers are employes which suggests that being employed is significant to getting a loan.
With better Prosper rating, the loan amount of all three terms increases, the increase amplitude of loan amount between terms also becomes larger.
#Save the cleaned dataset for explanatory data analysis
loan.to_csv('WrangledLoanData.csv')